;; tokenize.scm: .thrift tokenization routines for r6rs-thrift
;; Copyright (C) 2012 Julian Graham

;; r6rs-thrift is free software: you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation, either version 3 of the License, or
;; (at your option) any later version.

;; This program is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;; GNU General Public License for more details.

;; You should have received a copy of the GNU General Public License
;; along with this program.  If not, see <http://www.gnu.org/licenses/>.

#!r6rs

(library (thrift compile tokenize)
  (export thriftc:make-tokenizer

	  thriftc:make-lexical-token
	  thriftc:lexical-token-category
	  thriftc:lexical-token-source
	  thriftc:lexical-token-value

	  thriftc:source-location-line
	  thriftc:source-location-column)
  (import (rnrs))

  (define-record-type (thriftc:lexical-token 
		       thriftc:make-lexical-token 
		       thriftc:lexical-token?)
    (fields category source value))
  
  (define-record-type (thriftc:source-location 
		       thriftc:make-source-location 
		       thriftc:source-location?)
    (fields input line column offset))

  (define-condition-type &thriftc:source-location &condition 
    thriftc:make-source-location-condition thriftc:source-location-condition?
    (source-location thriftc:source-location-condition-location))

  (define (thriftc:make-tokenizer port)
    (define line 0)
    (define num-lines 0)
    (define column 0)
    (define offset 0)  
    (define lines (list))

    (define buffer (list))

    (define (lexer:read-char)
      (if (null? buffer)
	  (let ((c (read-char port)))
	    (or (eof-object? c)
		(begin
		  (if (char=? c #\newline)
		      (begin
			(set! line (+ line 1))
			(if (> line num-lines)
			    (begin (if (null? lines)
				       (set! lines (list column))
				       (set! lines 
					     (append lines (list column))))
				   (set! num-lines (+ num-lines 1))))
			(set! column 0))
		      (set! column (+ column 1)))
		  (set! offset (+ offset 1))))
	    c)
	  (let ((c (car buffer)))
	    (set! buffer (cdr buffer))
	    c)))
  
    (define (lexer:peek-char) (if (null? buffer) (peek-char port) (car buffer)))
    
    (define (lexer:unread-char c)
      (set! offset (- offset 1))
      (if (char? c)
	  (begin 
	    (if (char=? c #\newline)
		(begin (set! line (- line 1))
		       (set! column (list-ref lines line))))
	    (set! buffer (cons c buffer)))))

    (define (make-token category . value)
      (thriftc:make-lexical-token 
       category 
       (thriftc:make-source-location port line column offset) 
       (if (null? value) #f (car value))))

    (define (read-string type)
      (define (read-string-inner lst)
	(let ((c (lexer:read-char)))
	  (cond ((or (eof-object? c) (eqv? c #\newline))
		 (raise (condition (make-assertion-violation)
				   (make-message-condition
				    "Unexpected end of string literal."))))

		((eqv? c #\\)
		 (let ((ec (lexer:read-char)))
		   (cond ((eof-object? ec) (list->string (reverse lst)))
			 ((eqv? ec #\newline) (list->string (reverse lst)))
			 ((eqv? ec #\t) (read-string-inner (cons #\tab lst)))
			 ((eqv? ec #\r) (read-string-inner (cons #\return lst)))
			 ((eqv? ec #\n)
			  (read-string-inner (cons #\newline lst)))
			 
			 (else (cons c (read-string type))))))
		
		((char=? c type) (list->string (reverse (cons c lst))))
		(else (read-string-inner (cons c lst))))))
      (read-string-inner '()))

    (define (ident-start-char? chr)
      (or (char-alphabetic? chr) (char=? chr #\_)))
    (define (ident-rest-char? chr)
      (or (ident-start-char? chr) (char=? chr #\.) (char-numeric? chr)))

    (define (read-number chr)
      (define (list->number lst radix) 
	(string->number (list->string lst) radix))

      (define (read-hex)
	(define (read-hex-inner lst)
	  (let ((c (lexer:read-char)))
	    (cond ((eof-object? c) (list->number (reverse lst) 16))
		  ((or (and (char>=? c #\0) (char<=? c #\9))
		       (and (char-ci>=? c #\a) (char-ci<=? c #\f)))
		   (read-hex-inner (cons c lst)))
		  (else (lexer:unread-char c)
			(list->number (reverse lst) 16)))))
	(read-hex-inner '(#\0)))
      
      (define (read-number-inner lst seen-dot? seen-e?)
	(let ((c (lexer:read-char)))
	  (cond ((eof-object? c) (list->number (reverse lst) 10))
		((char-numeric? c) 
		 (read-number-inner (cons c lst) seen-dot? seen-e?))
		((char-ci=? c #\e)
		 (if seen-e?
		     (begin (lexer:unread-char c) 
			    (list->number (reverse lst) 10))
		     (read-number-inner (cons c lst) #t #t)))
		((char=? c #\.) 
		 (if (or seen-dot? seen-e?)
		     (begin (lexer:unread-char c)
			    (list->number (reverse lst) 10))
		     (read-number-inner (cons c lst) #t #f)))
		(else (begin (lexer:unread-char c)
			     (list->number (reverse lst) 10))))))

      (if (eqv? chr #\0)
	  (let ((cc (lexer:read-char)))
	    (cond
	     ((char-ci=? cc #\x) (read-hex))
	     ((eqv? cc #\.) (read-number-inner (list #\. #\0) #t #f))
	     (else (read-number-inner (list #\0) #f #f))))
	  (read-number-inner (list chr) #f #f)))

    (define (consume-comments)
      (define (until-newline)
	(let ((c (lexer:read-char)))
	  (or (eof-object? c) (eqv? #\newline c) (until-newline))))

      (define (until-block-close)
	(let ((c (lexer:read-char)))
	  (cond ((eof-object? c)
		 (raise (condition 
			 (make-assertion-violation)
			 (make-message-condition 
			  "Unterminated block comment."))))
		((eqv? c #\*) (if (eqv? #\/ (lexer:peek-char))
				  (lexer:read-char)
				  (until-block-close)))
		(else (until-block-close)))))

      (let ((c (lexer:peek-char)))
	(cond ((eof-object? c) #f)
	      ((eqv? c #\#) (until-newline) #t)
	      ((eqv? c #\/)
	       (lexer:read-char)
	       (let ((c (lexer:peek-char)))
		 (cond ((eof-object? c) #f)
		       ((eqv? c #\/) (until-newline) #t)
		       ((eqv? c #\*) (lexer:read-char) (until-block-close) #t)
		       (else (raise (condition 
				     (make-assertion-violation)
				     (make-message-condition
				      "Malformed block comment.")))))))
	      (else #f))))
	      
    (define (consume-whitespace)
      (let ((c (lexer:peek-char)))
	(or (eof-object? c)
	    (and (char-whitespace? c)
		 (lexer:read-char)
		 (consume-whitespace)))))
    
    (define (read-ident chr)
      (define (read-rest)
	(let ((c (lexer:read-char)))
	  (cond ((eof-object? c) '())
		((ident-rest-char? c) (cons c (read-rest)))
		(else (lexer:unread-char c) '()))))
      (list->string (cons chr (read-rest))))
    
    (define (consume-whitespace-and-comments)
      (consume-whitespace)
      (if (consume-comments) 
	  (consume-whitespace-and-comments)))

    (lambda ()
      (consume-whitespace-and-comments)
      (let ((c (lexer:read-char)))
	(cond ((eof-object? c) '*eoi*)
	      ((eqv? c #\() (make-token 'LPAREN))
	      ((eqv? c #\)) (make-token 'RPAREN))
	      ((eqv? c #\<) (make-token 'LANGLE))
	      ((eqv? c #\>) (make-token 'RANGLE))
	      ((eqv? c #\{) (make-token 'LBRACE))
	      ((eqv? c #\}) (make-token 'RBRACE))
	      ((eqv? c #\[) (make-token 'LBRACK))
	      ((eqv? c #\]) (make-token 'RBRACK))
	      ((eqv? c #\=) (make-token 'EQUAL))
	      ((eqv? c #\:) (make-token 'COLON))
	      ((eqv? c #\;) (make-token 'SEMICOLON))
	      ((eqv? c #\,) (make-token 'COMMA))
	      ((eqv? c #\*) (make-token 'STAR))

	      ((or (eqv? c #\') (eqv? c #\")) 
	       (let* ((s (read-string c))
		      (l (string-length s)))
		 (if (or (eqv? l 0) (not (eqv? (string-ref s (- l 1)) c)))
		     (raise (condition 
			     (make-assertion-violation)
			     (make-message-condition 
			      "Unterminated string literal.")))
		     (make-token 'STRING-LITERAL (substring s 0 (- l 1))))))

	      ((eqv? c #\-)
	       (let ((cc (lexer:read-char)))
		 (if (and (char-ci>=? cc #\0) (char-ci<=? cc #\9))
		     (let ((n (read-number cc)))
		       (make-token (if (exact? n) 
				       'NUM-INTEGER 
				       'NUM-FLOAT) 
				   (- n)))
		     (make-token 'SYMBOL (string c)))))
	      ((char-numeric? c)
	       (let ((n (read-number c)))
		 (make-token (if (exact? n) 'NUM-INTEGER 'NUM-FLOAT) n)))

	      ((ident-start-char? c)
	       (let ((ident (read-ident c)))
		 (cond ((equal? ident "true") (make-token 'TRUE))
		       ((equal? ident "false") (make-token 'FALSE))
		       ((equal? ident "bool") (make-token 'BOOL))
		       ((equal? ident "byte") (make-token 'BYTE))
		       ((equal? ident "binary") (make-token 'BINARY))
		       ((equal? ident "i16") (make-token 'I16))
		       ((equal? ident "i32") (make-token 'I32))
		       ((equal? ident "i64") (make-token 'I64))
		       ((equal? ident "double") (make-token 'DOUBLE))
		       ((equal? ident "string") (make-token 'STRING))
		       ((equal? ident "list") (make-token 'LIST))
		       ((equal? ident "set") (make-token 'SET))
		       ((equal? ident "map") (make-token 'MAP))
		       ((equal? ident "struct") (make-token 'STRUCT))
		       ((equal? ident "union") (make-token 'UNION))
		       ((equal? ident "extends") (make-token 'EXTENDS))
		       ((equal? ident "exception") (make-token 'EXCEPTION))
		       ((equal? ident "typedef") (make-token 'TYPEDEF))
		       ((equal? ident "enum") (make-token 'ENUM))
		       ((equal? ident "senum") (make-token 'SENUM))
		       ((equal? ident "namespace") (make-token 'NAMESPACE))
		       ((equal? ident "include") (make-token 'INCLUDE))
		       ((equal? ident "const") (make-token 'CONST))
		       ((equal? ident "required") (make-token 'REQUIRED))
		       ((equal? ident "optional") (make-token 'OPTIONAL))
		       ((equal? ident "service") (make-token 'SERVICE))
		       ((equal? ident "throws") (make-token 'THROWS))
		       ((equal? ident "void") (make-token 'VOID))
		       ((or (equal? ident "async") 
			    (equal? ident "oneway")) (make-token 'ONEWAY))
		       
		       (else (make-token 'IDENTIFIER ident)))))
	      (else (make-token 'SYMBOL (string c)))))))
)
